{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "sys.path.append(\"..\")\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "text = \"You say goodbye and I say hello.\"\n",
    "words = text.lower().replace(\".\", \" .\").split()\n",
    "words"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "word_to_id = {}\n",
    "id_to_word = {}\n",
    "\n",
    "for word in words:\n",
    "    if word not in word_to_id:\n",
    "        new_id = len(word_to_id)\n",
    "        word_to_id[word] = new_id\n",
    "        id_to_word[new_id] = word"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(word_to_id)\n",
    "print(id_to_word)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "corpus = [word_to_id[w] for w in words]\n",
    "corpus = np.array(corpus)\n",
    "corpus"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def preprocess(text):\n",
    "    words = text.lower().replace(\".\", \" .\").split()\n",
    "    \n",
    "    word_to_id = {}\n",
    "    id_to_word = {}\n",
    "    \n",
    "    for word in words:\n",
    "        if word not in word_to_id:\n",
    "            new_id = len(word_to_id)\n",
    "            word_to_id[word] = new_id\n",
    "            id_to_word[new_id] = word\n",
    "    \n",
    "    corpus = np.array([word_to_id[w] for w in words])\n",
    "    \n",
    "    return corpus, word_to_id, id_to_word"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from common.util import preprocess"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# text = \"You say goodbye and I say hello.\"\n",
    "corpus, word_to_id, id_to_word = preprocess(text)\n",
    "print(corpus)\n",
    "print(id_to_word)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_co_matrix(corpus, vocab_size, window_size=1):\n",
    "    corpus_size = len(corpus)\n",
    "    co_matrix = np.zeros((vocab_size, vocab_size), dtype=np.int32)\n",
    "    \n",
    "    for idx, word_id in enumerate(corpus):\n",
    "        for i in range(1, window_size + 1):\n",
    "            left_idx = idx - i\n",
    "            right_idx = idx + i\n",
    "            \n",
    "            if left_idx >= 0:\n",
    "                left_word_id = corpus[left_idx]\n",
    "                co_matrix[word_id, left_word_id] += 1\n",
    "            \n",
    "            if right_idx < corpus_size:\n",
    "                right_word_id = corpus[right_idx]\n",
    "                co_matrix[word_id, right_word_id] += 1\n",
    "    \n",
    "    return co_matrix"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def cos_similarity(x, y, eps=1e-8):\n",
    "    nx = x / np.sqrt(np.sum(x ** 2) + eps)\n",
    "    ny = y / np.sqrt(np.sum(y ** 2) + eps)\n",
    "    return np.dot(nx, ny)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from common.util import create_co_matrix, cos_similarity"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# text = \"You say goodbye and I say hello.\"\n",
    "# corpus, word_to_id, id_to_word = preprocess(text)\n",
    "vocab_size = len(word_to_id)\n",
    "C = create_co_matrix(corpus, vocab_size)\n",
    "\n",
    "c0 = C[word_to_id[\"you\"]]\n",
    "c1 = C[word_to_id[\"i\"]]\n",
    "print(cos_similarity(c0, c1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def most_similar(query, word_to_id, id_to_word, word_matrix, top=5):\n",
    "    # get query\n",
    "    if query not in word_to_id:\n",
    "        print(f\"{query} is not found\")\n",
    "        return\n",
    "    \n",
    "    print(\"[query]\", query)\n",
    "    query_id = word_to_id[query]\n",
    "    query_vec = word_matrix[query_id]\n",
    "    \n",
    "    # compute cosine similarity\n",
    "    vocab_size = len(id_to_word)\n",
    "    similarity = np.zeros(vocab_size)\n",
    "    for i in range(vocab_size):\n",
    "        similarity[i] = cos_similarity(word_matrix[i], query_vec)\n",
    "    \n",
    "    # output top 5 cosine similarity word\n",
    "    count = 0\n",
    "    for i in (-1 * similarity).argsort():\n",
    "        if id_to_word[i] == query:\n",
    "            continue\n",
    "        print(f\" {id_to_word[i]}: {similarity[i]}\")\n",
    "        \n",
    "        count += 1\n",
    "        if count >= top:\n",
    "            return"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# text = \"You say goodbye and I say hello.\"\n",
    "# corpus, word_to_id, id_to_word = preprocess(text)\n",
    "# vocab_size = len(word_to_id)\n",
    "# C = create_co_matrix(corpus, vocab_size)\n",
    "\n",
    "most_similar(\"you\", word_to_id, id_to_word, C, top=5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def ppmi(C, verbose=False, eps=1e-8):\n",
    "    M = np.zeros_like(C, dtype=np.float32)\n",
    "    N = np.sum(C)\n",
    "    S = np.sum(C, axis=0)\n",
    "    total = C.shape[0] * C.shape[1]\n",
    "    cnt = 0\n",
    "    \n",
    "    for i in range(C.shape[0]):\n",
    "        for j in range(C.shape[1]):\n",
    "            pmi = np.log2(C[i, j] * N / (S[j] * S[i]) + eps)\n",
    "            M[i, j] = max(0, pmi)\n",
    "            \n",
    "            if verbose:\n",
    "                cnt += 1\n",
    "                if cnt % (total // 100) == 0:\n",
    "                    print(f\"{100 * cnt / total}% done\")\n",
    "    return M"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from common.util import ppmi"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "W = ppmi(C)\n",
    "\n",
    "np.set_printoptions(precision=3)\n",
    "print(\"covariance matrix\")\n",
    "print(C)\n",
    "print(\"-\" * 50)\n",
    "print(\"PPMI\")\n",
    "print(W)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "text = \"You say goodbye and I say hello.\"\n",
    "corpus, word_to_id, id_to_word = preprocess(text)\n",
    "vocab_size = len(word_to_id)\n",
    "C = create_co_matrix(corpus, vocab_size, window_size=1)\n",
    "W = ppmi(C)\n",
    "U, S, V = np.linalg.svd(W)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(C[0], W[0], U[0], sep=\"\\n\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for word, word_id in word_to_id.items():\n",
    "    plt.annotate(word, (U[word_id, 0], U[word_id, 1]))\n",
    "\n",
    "plt.scatter(U[:, 0], U[:, 1], alpha=0.5)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!python3 show_ptb.py"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from dataset import ptb\n",
    "\n",
    "corpus, word_to_id, id_to_word = ptb.load_data(\"train\")\n",
    "\n",
    "print(\"corpus size:\", len(corpus))\n",
    "print(\"corpus[:30]:\", corpus[:30])\n",
    "print()\n",
    "print(\"id_to_word[0]:\", id_to_word[0])\n",
    "print(\"id_to_word[1]:\", id_to_word[1])\n",
    "print(\"id_to_word[2]:\", id_to_word[2])\n",
    "print()\n",
    "print(\"word_to_id['car']:\", word_to_id[\"car\"])\n",
    "print(\"word_to_id['happy']:\", word_to_id[\"happy\"])\n",
    "print(\"word_to_id['lexus']:\", word_to_id[\"lexus\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.utils.extmath import randomized_svd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "window_size = 2\n",
    "wordvec_size = 100\n",
    "\n",
    "corpus, word_to_id, id_to_word = ptb.load_data(\"train\")\n",
    "vocab_size = len(word_to_id)\n",
    "print(\"counting co-occurrence ...\")\n",
    "C = create_co_matrix(corpus, vocab_size, window_size)\n",
    "print(\"calculating PPMI ...\")\n",
    "W = ppmi(C, verbose=True)\n",
    "\n",
    "print(\"calculating SVD ...\")\n",
    "U, S, V = randomized_svd(W, n_components=wordvec_size, n_iter=5, random_state=None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "word_vecs = U[:, :wordvec_size]\n",
    "querys = [\"you\", \"year\", \"car\", \"toyota\"]\n",
    "for query in querys:\n",
    "    most_similar(query, word_to_id, id_to_word, word_vecs, top=5)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
